# Notebook setup: imports, reproducibility seed, Plotly renderer, Spark session,
# and the job-postings CSV registered as a temporary SQL view.
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col,
    explode,
    monotonically_increasing_id,
    regexp_replace,
    split,
    transform,
    when,
)

# Seed NumPy's global random generator so reruns are repeatable.
np.random.seed(51)

# Render Plotly figures with the "notebook" renderer by default.
pio.renderers.default = "notebook"

# Start a Spark session, or reuse the one that is already running.
spark = (
    SparkSession.builder
    .appName("LightcastData")
    .getOrCreate()
)

# Read the postings CSV: the first row is the header, column types are
# inferred, records may span several lines, and a double quote is the
# escape character inside quoted fields.
df = (
    spark.read
    .options(
        header="true",
        inferSchema="true",
        multiLine="true",
        escape="\"",
    )
    .csv("data/lightcast_job_postings.csv")
)

# Make the DataFrame queryable through Spark SQL under the name job_postings.
df.createOrReplaceTempView("job_postings")

# Diagnostic checks; keep these commented out when rendering the submission.
# print("---This is Diagnostic check, No need to print it in the final doc---")
# df.printSchema()
# df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/23 01:28:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:> (0 + 1) / 1] 25/09/23 01:28:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
# Salary distribution by industry: one purple box per NAICS2_NAME value,
# drawing only the outlier points individually.
# NOTE(review): `pdf` is defined outside this section; presumably a pandas
# DataFrame with NAICS2_NAME and SALARY columns. Confirm against the caller.
fig = px.box(
    data_frame=pdf,
    x="NAICS2_NAME",
    y="SALARY",
    points="outliers",
    color_discrete_sequence=["purple"],
    title="Salary Distribution by Industry",
)

# Nested-dict form of the layout settings: global font family, title size,
# axis titles, and x tick labels rotated by 45 degrees.
fig.update_layout(
    font={"family": "Times New Roman"},
    title={"font": {"size": 16}},
    xaxis={"title": {"text": "Industry"}, "tickangle": 45},
    yaxis={"title": {"text": "Salary"}},
)

fig.show()

# Save an HTML copy of the figure next to the notebook.
fig.write_html("Q1a.html")
# Static image export is currently disabled:
# fig.write_image("Q1a.png")
#Analysis:
#Question 1b - Salary Distribution by Employment Type
# Salary distribution by employment type: one orange box per
# EMPLOYMENT_TYPE_NAME value, drawing only the outlier points individually.
# NOTE(review): `pdf` is defined outside this section; presumably a pandas
# DataFrame with EMPLOYMENT_TYPE_NAME and SALARY columns. Confirm.
fig = px.box(
    data_frame=pdf,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY",
    points="outliers",
    color_discrete_sequence=["orange"],
    title="Salary Distribution by Employment Type",
)

# Nested-dict form of the layout settings: global font family, title size,
# axis titles, and x tick labels rotated by 45 degrees.
fig.update_layout(
    font={"family": "Times New Roman"},
    title={"font": {"size": 16}},
    xaxis={"title": {"text": "Employment Type"}, "tickangle": 45},
    yaxis={"title": {"text": "Salary"}},
)

fig.show()

# Save an HTML copy of the figure next to the notebook.
fig.write_html("Q1b.html")
# Static image export is currently disabled:
# fig.write_image("Q1b.png")
#Analysis:
#Question 2 - Salary Analysis by ONET Occupation Type
# Salary analysis by occupation: aggregate `pdf` per LOT_OCCUPATION_NAME and
# draw a bubble chart of median salary, with bubble size showing the count.
# NOTE(review): `pdf` is defined outside this section; presumably a pandas
# DataFrame with LOT_OCCUPATION_NAME and SALARY columns. Confirm.
saonet = (
    pdf.groupby("LOT_OCCUPATION_NAME")
    .agg(
        median_salary=("SALARY", "median"),  # median salary per occupation
        job_count=("SALARY", "count"),  # non-null SALARY values per occupation
    )
    .reset_index()
)

fig = px.scatter(
    saonet,
    x="LOT_OCCUPATION_NAME",
    y="median_salary",
    size="job_count",
    size_max=60,
    # A continuous color scale is only applied when `color=` names a numeric
    # column. The original passed the scale as ["plasma"] with no color
    # column, so it had no effect.
    # NOTE(review): coloring by median_salary is an assumption about the
    # intended mapping; switch to "job_count" if that was meant.
    color="median_salary",
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Salary Analysis by Occupation",
)

# Shared presentation settings: font, title size, axis titles, and x tick
# labels rotated by 45 degrees.
fig.update_layout(
    font_family="Times New Roman",
    title_font_size=16,
    xaxis_title="Occupation",
    yaxis_title="Median Salary",
    xaxis_tickangle=45,
)

fig.show()

# Save an HTML copy of the figure next to the notebook.
fig.write_html("Q2.html")
# Static image export is currently disabled:
# fig.write_image("Q2.png")